{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "745cff4e",
   "metadata": {},
   "outputs": [],
   "source": [
    "from tokenizers import Tokenizer, models, Encoding, pre_tokenizers, decoders, Regex, CharBPETokenizer, trainers\n",
    "import json\n",
    "\n",
    "\n",
    "with open(\"../../config/symbols.json\", \"r\") as f:\n",
    "    vocab_list: list[str] = json.loads(f.read())\n",
    "special_tokens = [\"<PAD>\", \"<SOS>\", \"<EOS>\", \"<UNK>\", \"<MASK>\"]\n",
    "tokenizer: Tokenizer = CharBPETokenizer(unk_token=\"<UNK>\")\n",
    "\n",
    "tokenizer.add_special_tokens(special_tokens)\n",
    "tokenizer.add_tokens(special_tokens)\n",
    "tokenizer.add_tokens(vocab_list)\n",
    "\n",
    "trainer = trainers.WordLevelTrainer(special_tokens=special_tokens)\n",
    "\n",
    "tokenizer.save(\"../../config/input_tokenizer.json\")\n",
    "with open(\"../../config/input_tokenizer.json\", \"r\") as f:\n",
    "    obj = json.loads(f.read())\n",
    "    v: dict[str, int] = tokenizer.get_vocab()\n",
    "    obj[\"model\"][\"vocab\"] = v\n",
    "with open(\"../../config/input_tokenizer.json\", \"w\") as f:\n",
    "    f.write(json.dumps(obj, ensure_ascii=False, indent=2))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bad73eb9",
   "metadata": {},
   "source": [
    "### Load generated token."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "64fe8cac",
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer: Tokenizer = Tokenizer.from_file(\"../../config/input_tokenizer.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "64e5ece1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ids: [2, 1, 0, 51, 3, 68, 75, 75, 78, 36, 5, 12, 18, 35, 26, 30, 17, 35, 15, 39, 51, 21, 5, 27, 13, 5, 21, 51, 37, 6, 28, 33, 5, 25, 35, 24, 35, 15, 40, 51, 37, 5, 36, 13, 35, 24, 25, 5, 18, 9, 30, 14, 5, 25, 26, 5, 36, 33, 5, 9, 21, 5, 27, 25, 35, 21, 35, 24, 35, 15, 13, 35, 27, 26, 35, 22, 35, 27, 9, 5, 24, 51, 37, 5, 36, 13, 35, 24, 25, 5, 18, 9, 30, 14, 5, 25, 26, 5, 36, 33, 5, 9, 21, 5, 27, 13, 6, 22, 51, 21, 35, 25, 35, 17, 35, 15, 90]\n",
      "encoded: ['<EOS>', '<SOS>', '<PAD>', ' ', '<UNK>', 'e', 'l', 'l', 'o', 'ي', 'ا', 'خ', 'ش', 'ى', 'م', 'ۇ', 'س', 'ى', 'ز', '،', ' ', 'ق', 'ا', 'ن', 'د', 'ا', 'ق', ' ', 'ئ', 'ە', 'ھ', 'ۋ', 'ا', 'ل', 'ى', 'ڭ', 'ى', 'ز', '؟', ' ', 'ئ', 'ا', 'ي', 'د', 'ى', 'ڭ', 'ل', 'ا', 'ش', 'ت', 'ۇ', 'ر', 'ا', 'ل', 'م', 'ا', 'ي', 'ۋ', 'ا', 'ت', 'ق', 'ا', 'ن', 'ل', 'ى', 'ق', 'ى', 'ڭ', 'ى', 'ز', 'د', 'ى', 'ن', 'م', 'ى', 'ك', 'ى', 'ن', 'ت', 'ا', 'ڭ', ' ', 'ئ', 'ا', 'ي', 'د', 'ى', 'ڭ', 'ل', 'ا', 'ش', 'ت', 'ۇ', 'ر', 'ا', 'ل', 'م', 'ا', 'ي', 'ۋ', 'ا', 'ت', 'ق', 'ا', 'ن', 'د', 'ە', 'ك', ' ', 'ق', 'ى', 'ل', 'ى', 'س', 'ى', 'ز', '.']\n",
      "len of tokens: 117\n",
      "Source : <EOS><SOS><PAD> Helloياخشىمۇسىز، قانداق ئەھۋالىڭىز؟ ئايدىڭلاشتۇرالمايۋاتقانلىقىڭىزدىنمىكىنتاڭ ئايدىڭلاشتۇرالمايۋاتقاندەك قىلىسىز.\n",
      "Decoded: <EOS><SOS><PAD> <UNK>elloياخشىمۇسىز، قانداق ئەھۋالىڭىز؟ ئايدىڭلاشتۇرالمايۋاتقانلىقىڭىزدىنمىكىنتاڭ ئايدىڭلاشتۇرالمايۋاتقاندەك قىلىسىز.\n"
     ]
    }
   ],
   "source": [
    "text = \"<EOS><SOS><PAD> Helloياخشىمۇسىز، قانداق ئەھۋالىڭىز؟ ئايدىڭلاشتۇرالمايۋاتقانلىقىڭىزدىنمىكىنتاڭ ئايدىڭلاشتۇرالمايۋاتقاندەك قىلىسىز.\"\n",
    "encoded: Encoding = tokenizer.encode(text)\n",
    "print(f\"ids: {encoded.ids}\")\n",
    "print(f\"encoded: {encoded.tokens}\")\n",
    "decoded: str = tokenizer.decode(encoded.ids)\n",
    "print(f\"len of tokens: {len(encoded.ids)}\")\n",
    "print(f\"Source : {text}\")\n",
    "print(f\"Decoded: {decoded}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "spell-correction",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
